{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "attractive-module",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 存在问题\n",
    "# 多次尝试发现标题无法获取"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "whole-calgary",
   "metadata": {},
   "outputs": [],
   "source": [
    "'''\n",
    "阿里研究院\n",
    "阿里健康\n",
    "阿里巴巴商学院\n",
    "阿里数据\n",
    "\n",
    "腾讯金融科技\n",
    "腾讯研究院\n",
    "腾讯媒体研究院\n",
    "腾讯云启研究院\n",
    "酷鹅用户研究院\n",
    "'''\n",
    "公众号 = \"图书情报专硕圈\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "demonstrated-tribe",
   "metadata": {},
   "outputs": [],
   "source": [
    "fn = { \"output\" : { \"公众号_htm_snippets\": \"data_raw_src/公众号_htm_snippets_{公众号}.tsv\",\n",
    "                    \"公众号_df\": \"data_raw_src/公众号_df_{公众号}.tsv\",\n",
    "                    \"公众号_xlsx\": \"data_sets/公众号_url_{公众号}.xlsx\" } \\\n",
    "      }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "quarterly-suffering",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from lxml.html import fromstring\n",
    "import time\n",
    "from random import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "painful-bronze",
   "metadata": {},
   "outputs": [],
   "source": [
    "import selenium"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "superb-frost",
   "metadata": {},
   "outputs": [],
   "source": [
    "from selenium import webdriver\n",
    "import time\n",
    "\n",
    "wd = webdriver.Chrome()\n",
    "wd.get(\"https://www.baidu.com\")    # 打开百度浏览器\n",
    "wd.find_element_by_id(\"kw\").send_keys(\"selenium\")   # 定位输入框并输入关键字\n",
    "wd.find_element_by_id(\"su\").click()   #点击[百度一下]搜索\n",
    "time.sleep(3)   #等待3秒\n",
    "wd.quit()   #关闭浏览器"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "painted-disposition",
   "metadata": {},
   "outputs": [],
   "source": [
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.desired_capabilities import DesiredCapabilities\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "smoking-double",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-5-5f738edac08d>:15: DeprecationWarning: use options instead of chrome_options\n",
      "  driver = webdriver.Chrome( chrome_options = opts) #desired_capabilities=caps,\n"
     ]
    }
   ],
   "source": [
    "#caps=dict()\n",
    "#caps[\"pageLoadStrategy\"] = \"none\"   # Do not wait for full page load\n",
    "\n",
    "opts = webdriver.ChromeOptions()\n",
    "opts.add_argument('--no-sandbox') #解决DevToolsActivePort文件不存在的报错\n",
    "opts.add_argument('window-size=1920x3000') #指定浏览器分辨率\n",
    "opts.add_argument('--disable-gpu') #谷歌文档提到需要加上一这个属性来规避bug\n",
    "opts.add_argument('--hide-scrollbars') #隐藏滚动条, 应对些特殊页面\n",
    "#opts.add_argument('blink-settings=imagesEnabled=false') #不加载图片, 提升速度\n",
    "#opts.add_argument('--headless') #浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败\n",
    "# opts.binary_location = \"C:\\portable\\PortableApps\\IronPortable\\App\\Iron\\chrome.exe\"\n",
    "# opts.binary_location = \"C:\\Program Files\\Google\\Chrome\\Application\\chromedriver.exe\" #\"H:\\_coding_\\Gitee\\InternetNewMedia\\CapstonePrj2016\\chromedriver.exe\"  \n",
    "\n",
    "\n",
    "driver = webdriver.Chrome( chrome_options = opts) #desired_capabilities=caps,\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "baking-strand",
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.get(\"https://mp.weixin.qq.com\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "systematic-briefing",
   "metadata": {},
   "outputs": [],
   "source": [
    "payload =  {\"account\": \"IreneCchen@163.com\", \"password\": \"EvenWong&irene\"}\n",
    "\n",
    "# 切换为账号密码登录\n",
    "element = driver.find_element_by_xpath('//div[@class=\"login__type__container login__type__container__scan\"]/a').click()\n",
    "# element.get_attribute('innerHTML')\n",
    "# element.click()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "straight-motivation",
   "metadata": {},
   "outputs": [],
   "source": [
    "## 清空账号input\n",
    "driver.find_element_by_xpath('//form[@class=\"login_form\"]//input[@name=\"account\"]').clear()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "double-instrumentation",
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.find_element_by_xpath('//form[@class=\"login_form\"]//input[@name=\"account\"]').send_keys(payload['account'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "careful-analyst",
   "metadata": {},
   "outputs": [],
   "source": [
    "## 清空密码input\n",
    "driver.find_element_by_xpath('//form[@class=\"login_form\"]//input[@name=\"password\"]').clear()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "visible-solid",
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.find_element_by_xpath('//form[@class=\"login_form\"]//input[@name=\"password\"]').send_keys(payload['password'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "innovative-triangle",
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.find_element_by_xpath('//div[@class=\"login_btn_panel\"]/a').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "psychological-center",
   "metadata": {},
   "outputs": [],
   "source": [
    "## 展开\n",
    "element = driver.find_element_by_xpath('//*[@id=\"mp_header\"]/div/div/a')\n",
    "element.click()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "consolidated-vault",
   "metadata": {},
   "outputs": [],
   "source": [
    "## 点击图文素材\n",
    "element = driver.find_element_by_xpath('//*[@id=\"js_mp_sidemenu\"]/div/div/ul/li[2]/ul/li[1]/ul/li[1]/a')\n",
    "element.click()\n",
    "# //*[@id=\"js_mp_sidemenu_pop\"]/div[2]/div/div/ul/li[2]/ul/li[1]/ul/li[1]/a\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "advisory-absorption",
   "metadata": {},
   "outputs": [],
   "source": [
    "element = driver.find_element_by_xpath('//*[@id=\"js_main\"]/div[3]/div[2]/div/div/div/div[1]/div/div[1]/div[1]/i')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "offensive-negative",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 点击新建图文\n",
    "element = driver.find_element_by_xpath('//*[@id=\"js_main\"]/div[3]/div[2]/div/div/div/div[1]/div/div[1]/div[2]/ul/li[1]/a')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "canadian-terrorism",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['CDwindow-32C91B221AFEACACEC5442A3CFC93504', 'CDwindow-F210D565CCE0512F053CA4F7ABC39B6D']\n"
     ]
    }
   ],
   "source": [
    "print (driver.window_handles)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "advanced-lounge",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-20-6c6d5ce6602d>:1: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[1])\n"
     ]
    }
   ],
   "source": [
    "driver.switch_to_window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "statewide-jordan",
   "metadata": {},
   "outputs": [],
   "source": [
    "## 点击超链接\n",
    "element = driver.find_element_by_xpath('//div[@class=\"mp-head\"]//div[@class=\"media_list_box_inner\"]/ul[@class=\"tpl_list\"]/li[@id=\"js_editor_insertlink\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "superb-angel",
   "metadata": {},
   "outputs": [],
   "source": [
    "## 点击选择其他公众号\n",
    "element = driver.find_element_by_xpath('//*[@id=\"vue_app\"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[3]/div/div/p/div/button')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "liberal-death",
   "metadata": {},
   "outputs": [],
   "source": [
    "## 点击\n",
    "driver.find_element_by_xpath('//*[@id=\"vue_app\"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[3]/div/div/div/div/span/input').clear()\n",
    "driver.find_element_by_xpath('//*[@id=\"vue_app\"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[3]/div/div/div/div/span/input').send_keys(\"图书情报专硕圈\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "communist-participant",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<div class=\"weui-desktop-icon weui-desktop-icon__search weui-desktop-icon__small\" style=\"width: 20px; height: 20px;\"><!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!---->     <svg viewBox=\"0 0 24 24\" version=\"1.1\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\"><title>MP/Icon/Search</title> <g id=\"MP/Icon/Search\" stroke=\"none\" stroke-width=\"1\" fill=\"none\" fill-rule=\"evenodd\"><path d=\"M5.78025253,5.78248558 C8.51392257,3.04881554 12.9460774,3.04881554 15.6797475,5.78248558 C18.1730922,8.27583028 18.3922898,12.1821488 16.3373403,14.9239313 L20.6294949,19.2175144 L19.2152814,20.631728 L14.922508,16.3389663 C12.180685,18.394566 8.27384272,18.1755707 5.78025253,15.6819805 C3.04658249,12.9483105 3.04658249,8.51615562 5.78025253,5.78248558 Z M6.8409127,6.84314575 C4.6930291,8.99102935 4.6930291,12.4734367 6.8409127,14.6213203 C8.98879631,16.7692039 12.4712037,16.7692039 14.6190873,14.6213203 C16.7669709,12.4734367 16.7669709,8.99102935 14.6190873,6.84314575 C12.4712037,4.69526215 8.98879631,4.69526215 6.8409127,6.84314575 Z\" id=\"形状\"></path></g></svg> <!----> <!----> <!----> <!----> <!----></div>\n"
     ]
    }
   ],
   "source": [
    "## 点击放大镜\n",
    "element = driver.find_element_by_xpath('//button[@class=\"weui-desktop-icon-btn weui-desktop-search__btn\"]')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "# ActionChains(driver).move_to_element(element).click().perform()  \n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "worthy-sweden",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/FnFTDAaPV2B6z5L2VzJ17OAMqk7bZA4K6HMMlGqnRibwvYDxgmI4kV9Oo8vnRHQjZXDiaLZ8YQ7CyGzSdGb7Rr6w/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">图书情报专硕圈</strong> <i class=\"inner_link_account_wechat\">微信号：r125500</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/0Nk7yYNqGvAOKibu3RMNLib8Vkutgm73y22iakMaLu6s7mYjcEnvGyxKr0KlQSEAFq4AlXK7cH6ru9XKgaRibnn4CA/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">图书情报专硕集</strong> <i class=\"inner_link_account_wechat\">微信号：未设置</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/jkPknkWuz4f37ZIPaAFZ32h6vkVzxTqutmIVk327Oia5PXMK8NJiaic9btMH0TXdiaEcyAib7MKcNKiadTD60VSib7lbw/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">会计专硕MPAcc</strong> <i class=\"inner_link_account_wechat\">微信号：mpacc985</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/KbDuADA44WiaicraCv93ZSlPobgiaOib6aTOTXjfib13yVsQ26EwIVe4PjC7FOicWjUucy1lyKw1BGd0rv3rIm4ibxXOA/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">图书情报专硕</strong> <i class=\"inner_link_account_wechat\">微信号：未设置</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/nf8TbBb33icgBwvfsPj14W0EHNR2urSib7m9oBnBDQ78O5XvHOcmxw8WfZmTaQ34bLbh8KUbV6f09OlutibuOjTdA/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">兰州社科MBA MPA专硕</strong> <i class=\"inner_link_account_wechat\">微信号：lanzhousuccess</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li>\n"
     ]
    }
   ],
   "source": [
    "element = driver.find_element_by_xpath('//ul[@class=\"inner_link_account_list\"]')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "公众号SERP = main_content\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "enhanced-petersburg",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 解析\n",
    "root = fromstring(公众号SERP) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "silent-praise",
   "metadata": {},
   "outputs": [],
   "source": [
    "主 = root.xpath('//li[@class=\"inner_link_account_item\"]')\n",
    "\n",
    "account_list = []\n",
    "for e in 主:\n",
    "    account_nickname = e.xpath('./div/strong[@class=\"inner_link_account_nickname\"]')[0].text\n",
    "    account_wechat = e.xpath('./div/i[@class=\"inner_link_account_wechat\"]')[0].text\n",
    "    account_img = e.xpath('./div/img/@src')[0]\n",
    "    account = {\"nickname\": account_nickname, \"wechat\": account_wechat, \"img\": account_img,}\n",
    "    account_list.append(account)\n",
    "\n",
    "df_account = pd.DataFrame(account_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "independent-louisville",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>nickname</th>\n",
       "      <th>wechat</th>\n",
       "      <th>img</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>图书情报专硕圈</td>\n",
       "      <td>微信号：r125500</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/FnFTDAaPV2B6z5L...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>图书情报专硕集</td>\n",
       "      <td>微信号：未设置</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/0Nk7yYNqGvAOKib...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>会计专硕MPAcc</td>\n",
       "      <td>微信号：mpacc985</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/jkPknkWuz4f37ZI...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>图书情报专硕</td>\n",
       "      <td>微信号：未设置</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/KbDuADA44Wiaicr...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>兰州社科MBA MPA专硕</td>\n",
       "      <td>微信号：lanzhousuccess</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/nf8TbBb33icgBwv...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        nickname              wechat  \\\n",
       "0        图书情报专硕圈         微信号：r125500   \n",
       "1        图书情报专硕集             微信号：未设置   \n",
       "2      会计专硕MPAcc        微信号：mpacc985   \n",
       "3         图书情报专硕             微信号：未设置   \n",
       "4  兰州社科MBA MPA专硕  微信号：lanzhousuccess   \n",
       "\n",
       "                                                 img  \n",
       "0  http://mmbiz.qpic.cn/mmbiz_png/FnFTDAaPV2B6z5L...  \n",
       "1  http://mmbiz.qpic.cn/mmbiz_png/0Nk7yYNqGvAOKib...  \n",
       "2  http://mmbiz.qpic.cn/mmbiz_png/jkPknkWuz4f37ZI...  \n",
       "3  http://mmbiz.qpic.cn/mmbiz_png/KbDuADA44Wiaicr...  \n",
       "4  http://mmbiz.qpic.cn/mmbiz_png/nf8TbBb33icgBwv...  "
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_account"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "packed-label",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/FnFTDAaPV2B6z5L2VzJ17OAMqk7bZA4K6HMMlGqnRibwvYDxgmI4kV9Oo8vnRHQjZXDiaLZ8YQ7CyGzSdGb7Rr6w/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">图书情报专硕圈</strong> <i class=\"inner_link_account_wechat\">微信号：r125500</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div>\n"
     ]
    }
   ],
   "source": [
    "element = driver.find_element_by_xpath('//ul[@class=\"inner_link_account_list\"]/li')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "developing-depth",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\n跳转_input = driver.find_element_by_xpath(\\'//span[@class=\"weui-desktop-pagination__form\"]/input\\')\\n跳转_a = driver.find_element_by_xpath(\\'//span[@class=\"weui-desktop-pagination__form\"]/a\\')\\n跳转_input.clear()\\n跳转_input.send_keys(2)\\n跳转_a.click()\\n'"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 跳转testing\n",
    "'''\n",
    "跳转_input = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/input')\n",
    "跳转_a = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/a')\n",
    "跳转_input.clear()\n",
    "跳转_input.send_keys(2)\n",
    "跳转_a.click()\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "marine-douglas",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 127]\n",
      "False\n"
     ]
    }
   ],
   "source": [
    "# 跳转上限\n",
    "l_e = driver.find_elements_by_xpath('//label[@class=\"weui-desktop-pagination__num\"]')\n",
    "l_e_int  = [int(x.text) for x in l_e] \n",
    "print (l_e_int)\n",
    "print (l_e_int[0]==l_e_int[-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "changing-valley",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127]\n"
     ]
    }
   ],
   "source": [
    "pages = list(range(l_e_int[0],l_e_int[-1]+1 ))\n",
    "#print(pages[0:2])\n",
    "pages = list(range(1,l_e_int[-1]+1 ))\n",
    "print(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "treated-capacity",
   "metadata": {},
   "outputs": [],
   "source": [
    "# global varialbes \n",
    "html_raw = dict()\n",
    "main_content =\"\"\n",
    "element = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "oriental-correspondence",
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_pages (pages):\n",
    "    for p in pages:\n",
    "        print (p,end='\\t')\n",
    "\n",
    "        跳转_input = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/input')\n",
    "        跳转_a = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/a')\n",
    "        跳转_input.clear()\n",
    "        跳转_input.send_keys(p)\n",
    "        跳转_a.click()\n",
    "\n",
    "        time.sleep(20+1*random())\n",
    "\n",
    "        element = driver.find_element_by_xpath('//div[@class=\"inner_link_article_list\"]')\n",
    "        main_content = element.get_attribute('innerHTML')\n",
    "        #print(main_content)\n",
    "        html_raw[p] = main_content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "gentle-greeting",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t11\t12\t13\t14\t15\t16\t17\t18\t19\t20\t21\t22\t23\t24\t25\t26\t27\t28\t29\t30\t31\t32\t33\t34\t35\t36\t37\t38\t39\t40\t41\t42\t43\t44\t45\t46\t47\t48\t49\t50\t51\t52\t53\t54\t55\t56\t57\t58\t59\t60\t61\t62\t63\t64\t65\t66\t67\t68\t69\t70\t71\t72\t73\t74\t75\t76\t77\t78\t79\t80\t81\t82\t83\t84\t85\t86\t87\t88\t89\t90\t91\t92\t93\t94\t95\t96\t97\t98\t99\t100\t101\t102\t103\t104\t105\t106\t107\t108\t109\t110\t111\t112\t113\t114\t115\t116\t117\t118\t119\t120\t121\t122\t123\t124\t125\t126\t127\t"
     ]
    }
   ],
   "source": [
    "process_pages(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "velvet-subcommittee",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "127"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# current\n",
    "p"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "subject-brush",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>html_snippets</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>123</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>124</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>125</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>126</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>127</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>127 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                         html_snippets\n",
       "1    <div class=\"weui-desktop-radio-group\"><label c...\n",
       "2    <div class=\"weui-desktop-radio-group\"><label c...\n",
       "3    <div class=\"weui-desktop-radio-group\"><label c...\n",
       "4    <div class=\"weui-desktop-radio-group\"><label c...\n",
       "5    <div class=\"weui-desktop-radio-group\"><label c...\n",
       "..                                                 ...\n",
       "123  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "124  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "125  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "126  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "127  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "\n",
       "[127 rows x 1 columns]"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame([html_raw]).T\n",
    "df.columns = [\"html_snippets\"]\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "center-china",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Stored 'html_raw' (dict)\n"
     ]
    }
   ],
   "source": [
    "%store html_raw\n",
    "import pickle \n",
    "filehandler = open(\"html_raw\", 'wb') \n",
    "pickle.dump(html_raw, filehandler)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "animated-monte",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "111\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>html_snippets</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>113</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>114</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>115</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>116</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>117</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>118</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>119</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>120</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>121</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>122</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>123</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>124</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>125</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>126</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>127</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                         html_snippets\n",
       "12   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "113  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "114  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "115  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "116  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "117  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "118  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "119  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "120  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "121  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "122  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "123  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "124  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "125  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "126  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "127  <div class=\"weui-desktop-radio-group\"><label c..."
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_out = df[~df.duplicated()]\n",
    "print (len(df_out))\n",
    "df[df.duplicated()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "static-paint",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[12, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[12, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127]"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "try_again = list(df[df.duplicated()].index)\n",
    "print(try_again)\n",
    "try_again = try_again + list (set(pages).difference(set(df.index.values)))\n",
    "try_again"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "middle-suspect",
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = fn [\"output\"] [\"公众号_htm_snippets\"] \n",
    "df_out.to_csv(filename.format(公众号=公众号), sep=\"\\t\", encoding=\"utf8\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "distinct-innocent",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "12,15,19,18,28,29,18,18,10,6,20,15,22,7,15,11,7,8,23,11,8,20,16,9,7,7,5,10,5,5,5,6,9,10,16,21,17,9,19,13,9,6,8,11,11,14,21,17,24,23,22,13,20,24,24,20,20,16,7,13,7,5,9,32,20,5,8,6,7,7,13,13,13,8,11,5,5,7,7,5,19,6,6,6,12,6,11,11,15,17,10,14,14,15,14,14,19,21,22,17,8,11,14,11,6,8,9,10,10,6,7,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,"
     ]
    }
   ],
   "source": [
    "def parse_html_snippets(_snippet_):\n",
    "    root = fromstring(_snippet_) \n",
    "    title = [x.text for x in root.xpath('//div[@class=\"inner_link_article_title\"]')]\n",
    "    create_time = [x.text for x in root.xpath('//div[@class=\"inner_link_article_date\"]')]\n",
    "    link = [x for x in root.xpath('//a/@href')]\n",
    "    _df_ = pd.DataFrame({\"title\":title, \"create_time\": create_time, \"link\":link})\n",
    "    return(_df_)\n",
    "    \n",
    "l_df = []\n",
    "for p in pages:\n",
    "    _df_ = parse_html_snippets(df.loc[p,\"html_snippets\"])\n",
    "    print (len(_df_), end=\",\")\n",
    "    l_df.append(_df_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "enhanced-omega",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>None</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>None</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>None</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>None</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>None</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>None</td>\n",
       "      <td>2021-05-30</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>None</td>\n",
       "      <td>2021-05-30</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>None</td>\n",
       "      <td>2021-05-28</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>None</td>\n",
       "      <td>2021-05-28</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>None</td>\n",
       "      <td>2021-05-28</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>None</td>\n",
       "      <td>2021-05-28</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   title create_time                                               link\n",
       "0   None  2021-05-29  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...\n",
       "1   None  2021-05-29  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...\n",
       "2   None  2021-05-29  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...\n",
       "3   None  2021-05-29  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...\n",
       "4   None  2021-05-29  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...\n",
       "5   None  2021-05-30  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...\n",
       "6   None  2021-05-30  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...\n",
       "7   None  2021-05-28  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...\n",
       "8   None  2021-05-28  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...\n",
       "9   None  2021-05-28  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...\n",
       "10  None  2021-05-28  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg..."
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_url_out = pd.concat(l_df).reset_index(drop=True)\n",
    "df_url_out.loc[0:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "adjusted-possible",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1553</th>\n",
       "      <td>None</td>\n",
       "      <td>2018-10-26</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1554</th>\n",
       "      <td>None</td>\n",
       "      <td>2018-10-26</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1555</th>\n",
       "      <td>None</td>\n",
       "      <td>2018-10-26</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1556</th>\n",
       "      <td>None</td>\n",
       "      <td>2018-10-26</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1557</th>\n",
       "      <td>None</td>\n",
       "      <td>2018-10-25</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     title create_time                                               link\n",
       "1553  None  2018-10-26  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...\n",
       "1554  None  2018-10-26  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...\n",
       "1555  None  2018-10-26  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...\n",
       "1556  None  2018-10-26  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...\n",
       "1557  None  2018-10-25  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg..."
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_url_out.tail(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "republican-reasoning",
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyError",
     "evalue": "\"None of [Index([None, None, None, None, None, None, None, None, None, None,\\n       ...\\n       None, None, None, None, None, None, None, None, None, None],\\n      dtype='object', length=1558)] are in the [columns]\"",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-59-f08375286eaf>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m     16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     17\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtag\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtagging_list\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 18\u001b[0;31m     \u001b[0mindex_list\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_url_out\u001b[0m \u001b[0;34m[\u001b[0m \u001b[0mdf_url_out\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtitle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontains\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtag\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_list\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     19\u001b[0m     \u001b[0mv_v_pairs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0mtag\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mindex_list\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmelt\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"value\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     20\u001b[0m     \u001b[0mv_v_list\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv_v_pairs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m   3028\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mis_iterator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   3029\u001b[0m                 \u001b[0mkey\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3030\u001b[0;31m             \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_listlike_indexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mraise_missing\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   3031\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   3032\u001b[0m         \u001b[0;31m# take() does not accept boolean indexers\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_get_listlike_indexer\u001b[0;34m(self, key, axis, raise_missing)\u001b[0m\n\u001b[1;32m   1264\u001b[0m             \u001b[0mkeyarr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindexer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnew_indexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0max\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_reindex_non_unique\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkeyarr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1265\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1266\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_validate_read_indexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkeyarr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindexer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mraise_missing\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mraise_missing\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1267\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mkeyarr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindexer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1268\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_validate_read_indexer\u001b[0;34m(self, key, indexer, axis, raise_missing)\u001b[0m\n\u001b[1;32m   1306\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mmissing\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1307\u001b[0m                 \u001b[0maxis_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_axis_name\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1308\u001b[0;31m                 \u001b[0;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"None of [{key}] are in the [{axis_name}]\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1309\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1310\u001b[0m             \u001b[0max\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_axis\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mKeyError\u001b[0m: \"None of [Index([None, None, None, None, None, None, None, None, None, None,\\n       ...\\n       None, None, None, None, None, None, None, None, None, None],\\n      dtype='object', length=1558)] are in the [columns]\""
     ]
    }
   ],
   "source": [
    "# tagging 标记\n",
    "tagging_list = [\"\",\"南京农业大学\", \"研究方向\", \"学制学费\",\"招生人数\",\"信息科技学院\",\"211工程\",\"信息管理系\",\"2020年\",\\\n",
    "                \"院校排名\",\"2020年\",\\\n",
    "                \"扬州大学\",\"淮扬文化研究中心\",\"历年分数线\",\"招生人数\",\\\n",
    "                \"湘潭大学\",\"湖南省\",\"公共管理学院\",\"历年分数线\",\"2020年\",\\\n",
    "                \"370万\", \"管综\",\"三科联考\",\"英语二\",\"MBA\",\\\n",
    "                \"吉首大学\", \"2020年\",\\\n",
    "                \"中南大学\", \"复试录取\",\\\n",
    "                \"云南大学\",\"历年分数线\", \\\n",
    "                \"山西财经大学\", \"研究方向\", \"学制学费\",\"招生人数\",\"太原市\",\"情报学\",\\\n",
    "                \"四川大学\", \"录取复试\",\"2019年\",\"2020年\",\"成都\",\"研究方向\",\\\n",
    "                \"辽宁大学\",\"双一流\",\"东北人民政府\",\"朱德同志\",\"历年分数线\",\\\n",
    "                \"研究生\", \"2022年\", \"就业压力\", \"疫情\", \"招生规模\", \"国家大力发展\", \"教育部要求\"] #overwritable\n",
    "\n",
    "v_v_list = []\n",
    "\n",
    "for tag in tagging_list:\n",
    "    index_list = df_url_out [ df_url_out.title.str.contains(tag) ].index.to_list()\n",
    "    v_v_pairs = pd.DataFrame({tag:index_list}).melt().set_index(\"value\")\n",
    "    v_v_list.append(v_v_pairs)\n",
    "\n",
    "df_cat = v_v_list[0]\n",
    "for d in v_v_list:\n",
    "    df_cat.update(d)\n",
    "    \n",
    "# 尚未标记内容\n",
    "df_url_out.loc [ df_cat.query('variable==\"\"').index ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "residential-product",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg==&mid=2247494716&idx=5&sn=56f686bc341b676604c513c52a16ed00&chksm=f9906bd7cee7e2c1fa08f786a8cd895342f6f5a7e1c359df854d8d1e8154eafa62f378aa51ea#rd'"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_url_out.loc[53].link"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "political-barrel",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>193</th>\n",
       "      <td>None</td>\n",
       "      <td>2021-03-17</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>194</th>\n",
       "      <td>None</td>\n",
       "      <td>2021-03-17</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>195</th>\n",
       "      <td>None</td>\n",
       "      <td>2021-03-16</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>196</th>\n",
       "      <td>None</td>\n",
       "      <td>2021-03-16</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>197</th>\n",
       "      <td>None</td>\n",
       "      <td>2021-03-16</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1553</th>\n",
       "      <td>None</td>\n",
       "      <td>2018-10-26</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1554</th>\n",
       "      <td>None</td>\n",
       "      <td>2018-10-26</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1555</th>\n",
       "      <td>None</td>\n",
       "      <td>2018-10-26</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1556</th>\n",
       "      <td>None</td>\n",
       "      <td>2018-10-26</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1557</th>\n",
       "      <td>None</td>\n",
       "      <td>2018-10-25</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>150 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     title create_time                                               link\n",
       "193   None  2021-03-17  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...\n",
       "194   None  2021-03-17  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...\n",
       "195   None  2021-03-16  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...\n",
       "196   None  2021-03-16  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...\n",
       "197   None  2021-03-16  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...\n",
       "...    ...         ...                                                ...\n",
       "1553  None  2018-10-26  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...\n",
       "1554  None  2018-10-26  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...\n",
       "1555  None  2018-10-26  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...\n",
       "1556  None  2018-10-26  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...\n",
       "1557  None  2018-10-25  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...\n",
       "\n",
       "[150 rows x 3 columns]"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_url_out[df_url_out.duplicated()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "id": "breeding-orbit",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>None</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>None</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>None</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>None</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>None</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>684</th>\n",
       "      <td>None</td>\n",
       "      <td>2020-07-09</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>685</th>\n",
       "      <td>None</td>\n",
       "      <td>2020-07-10</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>686</th>\n",
       "      <td>None</td>\n",
       "      <td>2020-07-10</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>687</th>\n",
       "      <td>None</td>\n",
       "      <td>2020-07-10</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>688</th>\n",
       "      <td>None</td>\n",
       "      <td>2020-07-10</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>689 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    title create_time                                               link\n",
       "0    None  2021-05-29  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...\n",
       "1    None  2021-05-29  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...\n",
       "2    None  2021-05-29  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...\n",
       "3    None  2021-05-29  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...\n",
       "4    None  2021-05-29  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...\n",
       "..    ...         ...                                                ...\n",
       "684  None  2020-07-09  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...\n",
       "685  None  2020-07-10  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...\n",
       "686  None  2020-07-10  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...\n",
       "687  None  2020-07-10  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...\n",
       "688  None  2020-07-10  http://mp.weixin.qq.com/s?__biz=MzUxNzc4ODI5Mg...\n",
       "\n",
       "[689 rows x 3 columns]"
      ]
     },
     "execution_count": 81,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_url_out[~df_url_out.duplicated()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "binary-myrtle",
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'df_cat' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-58-f7c6f29cb6dc>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf_o\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_url_out\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_cat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnan\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfillna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"无法分类\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0mdf_o\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mNameError\u001b[0m: name 'df_cat' is not defined"
     ]
    }
   ],
   "source": [
    "df_o = df_url_out.join(df_cat).replace(\"\", np.nan).fillna(\"无法分类\")\n",
    "df_o"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "id": "thick-hawaiian",
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'df_o' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-83-5487f1d8ec41>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf_o\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdf_o\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtitle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontains\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"2020年\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m: name 'df_o' is not defined"
     ]
    }
   ],
   "source": [
    "df_o[df_o.title.str.contains(\"2020年\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "removable-monday",
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'df_o' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-57-91093fb9ff87>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf_stats\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_o\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mby\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"variable\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0magg\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m\"title\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\"count\"\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msort_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mby\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"title\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mascending\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0mdf_stats\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mNameError\u001b[0m: name 'df_o' is not defined"
     ]
    }
   ],
   "source": [
    "df_stats = df_o.groupby(by=\"variable\").agg({\"title\":\"count\"}).sort_values(by=\"title\", ascending=False)\n",
    "df_stats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "compressed-alexandria",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 输出"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "generous-psychiatry",
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'df_o' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-56-28f128f87184>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0mdf_account\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"rel_accounts\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdf_o\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"url_cat\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      3\u001b[0m \u001b[0mdf_stats\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"stats\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mNameError\u001b[0m: name 'df_o' is not defined"
     ]
    }
   ],
   "source": [
    "df_account.columns.name = \"rel_accounts\"\n",
    "df_o.columns.name = \"url_cat\"\n",
    "df_stats.columns.name = \"stats\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "hollywood-intro",
   "metadata": {},
   "outputs": [],
   "source": [
    "_df_.columns.name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "heated-zealand",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the xlsxwriter workbook and worksheet objects.  \n",
    "with pd.ExcelWriter(fn[\"output\"][\"公众号_xlsx\"].format(公众号=公众号)) as writer:\n",
    "    workbook  = writer.book\n",
    "\n",
    "    for _df_ in [df_account, df_o, df_stats]:\n",
    "        _df_.to_excel(writer, sheet_name = _df_.columns.name)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "personalized-nirvana",
   "metadata": {},
   "outputs": [],
   "source": [
    "                  \"公众号_xlsx\": \"data_sets/{公众号}_url.xlsx\" } \\\n",
    " "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": false,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
